library(ggplot2)
library(ggthemes)
library(dplyr)
library(viridis)
library(tidyr)
library(cluster)
library(ggmap)
library(maps)
Load the Dataset and Check for duplicated records
toronto <- read.csv('data/Major_Crime_Indicators_Open_Data.csv')
head(toronto)
## X Y OBJECTID EVENT_UNIQUE_ID REPORT_DATE
## 1 -79.42590 43.75735 1 GO-20141262074 2014/01/01 05:00:00+00
## 2 -79.35023 43.64629 2 GO-20141260701 2014/01/01 05:00:00+00
## 3 -79.37650 43.66642 3 GO-20141260889 2014/01/01 05:00:00+00
## 4 -85.48874 0.00000 4 GO-20141260973 2014/01/01 05:00:00+00
## 5 -79.34484 43.67895 5 GO-20141261050 2014/01/01 05:00:00+00
## 6 -79.39184 43.64664 6 GO-20141259344 2014/01/01 05:00:00+00
## OCC_DATE REPORT_YEAR REPORT_MONTH REPORT_DAY REPORT_DOY
## 1 1998/06/01 04:00:00+00 2014 January 1 1
## 2 2014/01/01 05:00:00+00 2014 January 1 1
## 3 2014/01/01 05:00:00+00 2014 January 1 1
## 4 2014/01/01 05:00:00+00 2014 January 1 1
## 5 2014/01/01 05:00:00+00 2014 January 1 1
## 6 2014/01/01 05:00:00+00 2014 January 1 1
## REPORT_DOW REPORT_HOUR OCC_YEAR OCC_MONTH OCC_DAY OCC_DOY OCC_DOW OCC_HOUR
## 1 Wednesday 12 NA NA NA 12
## 2 Wednesday 3 2014 January 1 1 Wednesday 3
## 3 Wednesday 4 2014 January 1 1 Wednesday 4
## 4 Wednesday 4 2014 January 1 1 Wednesday 4
## 5 Wednesday 4 2014 January 1 1 Wednesday 4
## 6 Wednesday 2 2014 January 1 1 Wednesday 2
## DIVISION LOCATION_TYPE
## 1 D32 Apartment (Rooming House, Condo)
## 2 D51 Commercial Dwelling Unit (Hotel, Motel, B & B, Short Term Rental)
## 3 D51 Apartment (Rooming House, Condo)
## 4 NSA Streets, Roads, Highways (Bicycle Path, Private Road)
## 5 D54 Streets, Roads, Highways (Bicycle Path, Private Road)
## 6 D52 Bar / Restaurant
## PREMISES_TYPE UCR_CODE UCR_EXT OFFENCE MCI_CATEGORY
## 1 Apartment 1480 110 Administering Noxious Thing Assault
## 2 Commercial 2120 200 B&E Break and Enter
## 3 Apartment 1430 100 Assault Assault
## 4 Outside 2130 210 Theft Over Theft Over
## 5 Outside 1430 100 Assault Assault
## 6 Commercial 1420 110 Assault Bodily Harm Assault
## HOOD_158 NEIGHBOURHOOD_158 HOOD_140 NEIGHBOURHOOD_140
## 1 38 Lansing-Westgate 38 Lansing-Westgate (38)
## 2 70 South Riverdale 70 South Riverdale (70)
## 3 74 North St.James Town 74 North St.James Town (74)
## 4 NSA NSA NSA NSA
## 5 69 Blake-Jones 66 Danforth (66)
## 6 164 Wellington Place 77 Waterfront Communities-The Island (77)
## LONG_WGS84 LAT_WGS84
## 1 -79.42590 43.75735
## 2 -79.35023 43.64629
## 3 -79.37650 43.66642
## 4 -85.48874 0.00000
## 5 -79.34484 43.67895
## 6 -79.39184 43.64664
sum(is.na(toronto))
## [1] 315
sapply(toronto, function(x) sum(is.na(x)))
## X Y OBJECTID EVENT_UNIQUE_ID
## 0 0 0 0
## REPORT_DATE OCC_DATE REPORT_YEAR REPORT_MONTH
## 0 0 0 0
## REPORT_DAY REPORT_DOY REPORT_DOW REPORT_HOUR
## 0 0 0 0
## OCC_YEAR OCC_MONTH OCC_DAY OCC_DOY
## 105 0 105 105
## OCC_DOW OCC_HOUR DIVISION LOCATION_TYPE
## 0 0 0 0
## PREMISES_TYPE UCR_CODE UCR_EXT OFFENCE
## 0 0 0 0
## MCI_CATEGORY HOOD_158 NEIGHBOURHOOD_158 HOOD_140
## 0 0 0 0
## NEIGHBOURHOOD_140 LONG_WGS84 LAT_WGS84
## 0 0 0
missingCols <- select(toronto, OCC_YEAR, OCC_MONTH, OCC_DAY, OCC_DOY, OCC_DOW)
toronto <- na.omit(toronto)
(colMeans(is.na(missingCols)))*100
## OCC_YEAR OCC_MONTH OCC_DAY OCC_DOY OCC_DOW
## 0.03247798 0.00000000 0.03247798 0.03247798 0.00000000
sum(duplicated(toronto$EVENT_UNIQUE_ID))
## [1] 41575
toronto <- subset(toronto, !duplicated(toronto$EVENT_UNIQUE_ID))
unique(toronto$OCC_YEAR)
## [1] 2014 2013 2012 2003 2011 2004 2010 2009 2008 2006 2000 2005 2002 2001 2015
## [16] 2007 2016 2017 2018 2019 2020 2021 2022
unique(toronto$REPORT_YEAR)
## [1] 2014 2015 2016 2017 2018 2019 2020 2021 2022
year_group <- group_by(toronto, OCC_YEAR)
crime_by_year <- summarise(year_group,
n = n())
crime_by_year
## # A tibble: 23 × 2
## OCC_YEAR n
## <int> <int>
## 1 2000 24
## 2 2001 17
## 3 2002 17
## 4 2003 12
## 5 2004 22
## 6 2005 19
## 7 2006 12
## 8 2007 25
## 9 2008 36
## 10 2009 54
## # ℹ 13 more rows
# Getting the Count of Each category of Crime
table(unlist(toronto$MCI_CATEGORY))
##
## Assault Auto Theft Break and Enter Robbery Theft Over
## 144916 41642 61089 23543 10426
drops <- c("X", "Y", "OBJECTID", "UCR_CODE", "UCR_EXT", "REPORT_DATE", "REPORT_MONTH", "REPORT_DAY", "REPORT_DOY", "REPORT_DOW", "REPORT_HOUR", "OCC_DOY", "REPORT_YEAR", "DIVISION", "HOOD_158", "HOOD_140")
toronto <- toronto[, !(names(toronto) %in% drops)]
What are the total number of crimes in each year.
cases_count <- toronto %>%
filter(OCC_YEAR >= 2014) %>%
count(OCC_YEAR) %>%
group_by(OCC_YEAR)
# Getting the Count of Number of Crimes each year
ggplot(cases_count, aes(x = OCC_YEAR, y = n, label=n)) +
geom_line() +
geom_point() +
geom_text(hjust=0, vjust=0) +
labs(x = "Year",
y = "Total Number of Criminal Cases throughout 2014 to 2022",
title = "Yearwise total Criminal Cases throughout 2014 to 2022")

What are the Major Crimes in each year.
# Grouping Major Crimes from 2014 to 2022
indicator_group <- group_by(filter(toronto, OCC_YEAR >= 2014, OCC_YEAR != "NA"), MCI_CATEGORY, OCC_YEAR)
crime_by_indicator <- summarise(indicator_group, n=n())
crime_by_indicator <- crime_by_indicator[order(crime_by_indicator$n, decreasing = TRUE),]
ggplot(aes(x = reorder(MCI_CATEGORY, n), y = n), data = crime_by_indicator) +
geom_bar(stat = 'identity', width = 0.5) +
geom_text(aes(label = n), stat = 'identity', data = crime_by_indicator, hjust = -0.1, size = 3.5) +
coord_flip() +
xlab('Major Crimes') +
ylab('Number of Occurrences') +
ggtitle('Major Crimes from 2014 to 2022') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

What are the different types of Assault and which one is the
worst.
# Getting the crimes from Assault Category for 2014
assault <- filter(toronto[toronto$MCI_CATEGORY == 'Assault', ], OCC_YEAR >= 2014, OCC_YEAR != "NA")
assault_group <- group_by(assault, OFFENCE, OCC_YEAR)
assault_by_offence <- summarise(assault_group, n=n())
assault_by_offence <- assault_by_offence[order(assault_by_offence$n, decreasing = TRUE), ]
ggplot(aes(x = reorder(OFFENCE, n), y = n), data = assault_by_offence) +
geom_bar(stat = 'identity', width = 0.6) +
geom_text(aes(label = n), stat = 'identity', data = assault_by_offence, hjust = -0.1, size = 3) +
coord_flip() +
xlab('Types of Assault') +
ylab('Number of Occurrences') +
ggtitle('Assault Crimes in 2014') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

Let’s look at the offences instead.
# Getting the Offences from 2014 to 2022
offence_group <- group_by(filter(toronto, OCC_YEAR >= 2014, OCC_YEAR != "NA"), OFFENCE, OCC_YEAR)
crime_by_offence <- summarise(offence_group, n=n())
crime_by_offence <- crime_by_offence[order(crime_by_offence$n, decreasing = TRUE), ]
ggplot(aes(x = reorder(OFFENCE, n), y = n), data = crime_by_offence) +
geom_bar(stat = 'identity', width = 0.7) +
geom_text(aes(label = n), stat = 'identity', data = crime_by_offence, hjust = -0.1, size = 2) +
coord_flip() +
xlab('Types of Offence') +
ylab('Number of Occurrences') +
ggtitle('Offence Types Toronto from 2014 to 2022') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

Checking for crimes wrt time of the day.
# Crimes WRT Time of Day from 2014 to 2022
hour_group <- group_by(filter(toronto, OCC_YEAR >= 2014, OCC_YEAR != "NA"), OCC_HOUR, OCC_YEAR)
crime_hour <- summarise(hour_group, n=n())
ggplot(aes(x=OCC_HOUR, y=n), data = crime_hour) + geom_line(size = 2.5, alpha = 0.7, color = "mediumseagreen", group=1) +
geom_point(size = 0.5) +
ggtitle('Total Crimes by Hour of Day in Toronto from 2014 to 2022') +
ylab('Number of Occurrences') +
xlab('Hour(24-hour clock)') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

What types of Crimes are most frequent wrt the time of the day?
# Crime Types WRT Hour of the day from 2014 to 2022
hour_crime_group <- group_by(filter(toronto, OCC_YEAR >= 2014), OCC_HOUR, OCC_YEAR, MCI_CATEGORY)
hour_crime <- summarise(hour_crime_group, n=n())
ggplot(aes(x=OCC_HOUR, y=n, color=MCI_CATEGORY), data = hour_crime) +
geom_line(size=1.5) +
ggtitle('Crime Types by Hour of Day in Toronto from 2014 to 2022') +
ylab('Number of Occurrences') +
xlab('Hour(24-hour clock)') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

Where in Toronto were those crimes most likely to
occur
# Crimes WRT location from 2014 to 2022
location_group <- group_by(filter(toronto, OCC_YEAR >= 2014), NEIGHBOURHOOD_158)
crime_by_location <- summarise(location_group, n=n())
crime_by_location <- crime_by_location[order(crime_by_location$n, decreasing = TRUE), ]
crime_by_location_top20 <- head(crime_by_location, 20)
ggplot(aes(x = reorder(NEIGHBOURHOOD_158, n), y = n), data = crime_by_location_top20) +
geom_bar(stat = 'identity', width = 0.6) +
geom_text(aes(label = n), stat = 'identity', data = crime_by_location_top20, hjust = -0.1, size = 3) +
coord_flip() +
xlab('Neighbourhoods') +
ylab('Number of Occurrences') +
ggtitle('Neighbourhoods with Most Crimes - Top 20 from 2014 to 2022') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold"))

Which are the Safest Neighborhoods to Live in 2022
location_group <- group_by(filter(toronto, OCC_YEAR == 2022), NEIGHBOURHOOD_158)
crime_by_location <- summarise(location_group, n=n())
crime_by_location <- crime_by_location[order(crime_by_location$n, decreasing = TRUE), ]
tail(crime_by_location, 5)
## # A tibble: 5 × 2
## NEIGHBOURHOOD_158 n
## <chr> <int>
## 1 Bendale South 71
## 2 Humber Heights-Westmount 71
## 3 Woodbine-Lumsden 66
## 4 Bayview Woods-Steeles 61
## 5 Guildwood 50
Comparing Neighborhoods with top offence types
# Top Offences in Each Neighborhood from in 2022
offence_location_group <- group_by(filter(toronto, OCC_YEAR == 2022), NEIGHBOURHOOD_158, OFFENCE, OCC_YEAR)
offence_type_by_location <- summarise(offence_location_group, n=n())
offence_type_by_location <- offence_type_by_location[order(offence_type_by_location$n, decreasing = TRUE), ]
offence_type_by_location_top20 <- head(offence_type_by_location, 50)
ggplot(aes(x = NEIGHBOURHOOD_158, y=n, fill = OFFENCE), data=offence_type_by_location_top20) +
geom_bar(stat = 'identity', position = position_dodge(), width = 0.8) +
xlab('Neighbourhood') +
ylab('Number of Occurrence') +
ggtitle('Offence Type vs. Neighbourhood Toronto in 2022') +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = .4)) +
facet_wrap(vars(OCC_YEAR))

Month Wise Crime Rates
# Major Crimes for every month from 2014 to 2022
crime_count <- filter(toronto, OCC_YEAR >= 2014) %>%
group_by(OCC_MONTH, MCI_CATEGORY, OCC_YEAR) %>%
summarise(Total = n())
crime_count$OCC_MONTH <- ordered(crime_count$OCC_MONTH, levels = c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'))
ggplot(crime_count, aes(OCC_MONTH, MCI_CATEGORY, fill = Total)) +
geom_tile(size = 1, color = "white") +
scale_fill_viridis() +
geom_text(aes(label=Total), color='white') +
ggtitle("Major Crime Indicators by Month from 2014 to 2022") +
xlab('Month') +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

Day of the Week Wise Crime Rates
# Major Crimes for every DOW from 2014 to 2022
day_count <- filter(toronto, OCC_YEAR >= 2014) %>%
group_by(OCC_DOW, MCI_CATEGORY, OCC_YEAR) %>%
summarise(Total = n())
ggplot(day_count, aes(OCC_DOW, MCI_CATEGORY, fill = Total)) +
geom_tile(size = 1, color = "white") +
scale_fill_viridis() +
geom_text(aes(label=Total), color='white') +
ggtitle("Major Crime Indicators by Day of Week from 2014 to 2022") +
xlab('Day of Week') +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold")) +
facet_wrap(vars(OCC_YEAR))

Homicide Rates
homicide <- read.csv('data/Homicides.csv', stringsAsFactors = F)
homicide <- filter(homicide, OCC_DATE >= 2014)
homicide$OCC_DATE <- as.Date(homicide$OCC_DATE)
year_group <- group_by(homicide, OCC_DATE, HOMICIDE_TYPE)
homicide_by_year <- summarise(year_group, n=n())
ggplot(aes(x = OCC_DATE, y=n, fill = HOMICIDE_TYPE), data = homicide_by_year) +
geom_bar(stat = 'identity', position = position_dodge(), width = 1) +
xlab('Year') +
ylab('Number of Homicides') +
ggtitle('Homicide 2014-2022') +
ylim(0, 5) +
theme_bw() +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold"))

homicide$month <- format(as.Date(homicide$OCC_DATE) , "%B")
homicide_count <- homicide %>% group_by(OCC_YEAR, month) %>% summarise(Total = n())
homicide_count$month <- ordered(homicide_count$month, levels = c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'))
ggplot(homicide_count, aes(OCC_YEAR, month, fill = Total)) +
geom_tile(size = 1, color = "white") +
scale_fill_viridis() +
geom_text(aes(label=Total), color='white') +
ggtitle("Homicides in Toronto (2014-2022)") +
xlab('Year') +
theme(plot.title = element_text(size = 16),
axis.title = element_text(size = 12, face = "bold"))

by_groups <- group_by(toronto, MCI_CATEGORY, NEIGHBOURHOOD_158)
groups <- summarise(by_groups, n=n())
groups <- groups[c("NEIGHBOURHOOD_158", "MCI_CATEGORY", "n")]
groups_wide <- spread(groups, key = MCI_CATEGORY, value = n)
groups_wide
## # A tibble: 159 × 6
## NEIGHBOURHOOD_158 Assault `Auto Theft` `Break and Enter` Robbery `Theft Over`
## <chr> <int> <int> <int> <int> <int>
## 1 Agincourt North 568 314 374 182 40
## 2 Agincourt South-… 947 351 648 173 103
## 3 Alderwood 277 193 241 49 57
## 4 Annex 1993 246 1363 288 271
## 5 Avondale 355 119 98 44 39
## 6 Banbury-Don Mills 656 236 543 80 88
## 7 Bathurst Manor 445 311 254 80 34
## 8 Bay-Cloverhill 975 81 488 156 100
## 9 Bayview Village 609 217 329 53 75
## 10 Bayview Woods-St… 289 133 223 26 21
## # ℹ 149 more rows
z <- groups_wide[, -c(1,1)]
z <- z[complete.cases(z), ]
m <- apply(z, 2, mean)
s <- apply(z, 2, sd)
z <- scale(z, m, s)
wss <- (nrow(z)-1) * sum(apply(z, 2, var))
for (i in 2:20) wss[i] <- sum(kmeans(z, centers=i)$withiness)
plot(1:20, wss, type='b', xlab='Number of Clusters', ylab='Within groups sum of squares')

kc <- kmeans(z, 2)
kc
## K-means clustering with 2 clusters of sizes 17, 142
##
## Cluster means:
## Assault Auto Theft Break and Enter Robbery Theft Over
## 1 2.1583428 1.0726446 2.1757297 2.0033547 2.2788979
## 2 -0.2583932 -0.1284152 -0.2604747 -0.2398382 -0.2728258
##
## Clustering vector:
## [1] 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 1
## [38] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 2 1 2
## [149] 2 2 2 2 2 1 2 2 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 229.2655 184.7964
## (between_SS / total_SS = 47.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
z1 <- data.frame(z, kc$cluster)
clusplot(z1, kc$cluster, color=TRUE, shade=F, labels=0, lines=0, main='k-Means Cluster Analysis')
